import pandas as pd
import numpy as np #For mathematical calculations
import seaborn as sns #For data visualization
import matplotlib.pyplot as plt #For plotting graphs
%matplotlib inline
# Load the bank marketing dataset and take a first look at it.
cData = pd.read_csv('bank.csv')
cData_copy = cData.copy(deep=True)  # untouched copy, kept for later comparison plots
cData.head()
print(cData.columns)
print(cData.shape)
The target variable is 'Personal Loan'. The 'ID', 'ZIP Code' and 'Experience' columns are not needed for further analysis: 'ID' and 'ZIP Code' are just identifier numbers, and 'Experience' is highly correlated with 'Age'.
!pip install pandas_profiling
import pandas_profiling
cData.profile_report()
cData.drop('ID',axis=1,inplace=True)
cData.drop('ZIP Code',axis=1,inplace=True)
# Proportion of purchased vs. not-purchased loans within each education level.
edu = pd.crosstab(cData['Education'], cData['Personal Loan'])
edu_pct = edu.div(edu.sum(axis=1).astype(float), axis=0)
edu_pct.plot(kind='bar', stacked=True)
print('cross tabulation can be given as:', ' \n', edu)
print('cross tabulation in percentages can be given as:', '\n', edu_pct)
We can infer that customers who are more educated have a higher probability of buying personal loans.
# Proportion of purchased vs. not-purchased loans within each family size.
family = pd.crosstab(cData['Family'], cData['Personal Loan'])
# BUG FIX: the original normalised by edu.sum(1) — the EDUCATION table's row
# totals — so the plotted family proportions were wrong. Divide by this
# table's own row totals instead (matching the printed percentages below).
family_pct = family.div(family.sum(1).astype(float), axis=0)
family_pct.plot(kind='bar', stacked=True)
print('cross tabulation can be given as:', ' \n', family)
print('cross tabulation in percentages can be given as:', '\n', family_pct)
The number of family members does not significantly affect the probability of buying a personal loan.
# Loan purchase proportions for customers who do vs. don't hold a credit
# card issued by UniversalBank.
cd = pd.crosstab(cData['CreditCard'], cData['Personal Loan'])
cd_pct = cd.div(cd.sum(axis=1).astype(float), axis=0)
cd_pct.plot(kind='bar', stacked=True)
print('cross tabulation can be given as:', ' \n', cd)
print('cross tabulation in percentages can be given as:', '\n', cd_pct)
The customer who uses or doesn’t use a credit card issued by UniversalBank doesn’t seem to affect the probability of buying a personal loan.
# Loan purchase proportions for customers who do vs. don't use internet
# banking facilities.
on = pd.crosstab(cData['Online'], cData['Personal Loan'])
on_pct = on.div(on.sum(axis=1).astype(float), axis=0)
on_pct.plot(kind='bar', stacked=True)
print('cross tabulation can be given as:', ' \n', on)
print('cross tabulation in percentages can be given as:', '\n', on_pct)
The customer who uses or doesn’t use internet banking facilities seems to not affect the probability of buying personal loans.
# Loan purchase proportions for customers who do vs. don't have a securities
# account with the bank.
sec = pd.crosstab(cData['Securities Account'], cData['Personal Loan'])
sec_pct = sec.div(sec.sum(axis=1).astype(float), axis=0)
sec_pct.plot(kind='bar', stacked=True)
print('cross tabulation can be given as:', ' \n', sec)
print('cross tabulation in percentages can be given as:', '\n', sec_pct)
The customers who have or don’t have securities account with the bank do not affect the probability of buying a personal loan.
# Age vs Personal Loan: mean applicant age per loan outcome.
cData.groupby('Personal Loan')['Age'].mean().plot(kind='bar')
# Bucket the continuous Age variable into categories with pandas.cut()
# (segments and sorts values into bins — turns a continuous variable into a
# categorical one) and inspect loan status within each bucket.
bins = [23, 35, 45, 55, 67]
group = ['Young', 'Youth', 'Medium', 'Old']
cData['Age_bin'] = pd.cut(cData['Age'], bins, labels=group)
age = pd.crosstab(cData['Age_bin'], cData['Personal Loan'])
age_pct = age.div(age.sum(axis=1).astype(float), axis=0)
age_pct.plot(kind='bar', stacked=True)
It can be inferred that the applicant's age does not affect the chances of buying a personal loan.
# Mean monthly credit-card spend (CCAvg) per loan outcome.
cData.groupby('Personal Loan').CCAvg.mean().plot(kind='bar')
It can be clearly seen that applicants who spend more on credit cards are more viable to buy personal loans.
# Does the customer's income affect the chance of buying a personal loan?
cData.groupby('Personal Loan').Income.mean().plot(kind='bar')
It can be clearly seen that the customers with high incomes are more feasible to buy the personal loan
# Use cData_copy (which still has 'Experience') to plot personal loan vs.
# experience.
cData_copy.groupby('Personal Loan').Experience.mean().plot(kind='bar')
‘Experience’ is highly correlated with ‘Age’
# Count missing values per column (isna is the modern alias of isnull).
cData.isna().sum()
from sklearn.model_selection import train_test_split
# BUG FIX: 'Age_bin' (the categorical column created above for the EDA
# plots) must be excluded along with the target — leaving it in X makes
# LogisticRegression.fit / zscore fail on a non-numeric column.
X = cData.drop(['Personal Loan', 'Age_bin'], axis=1)  # predictor feature columns
Y = cData['Personal Loan']  # predicted class (1=bought loan, 0=did not)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
# random_state=1 is an arbitrary seed, fixed for reproducibility
x_train.head()
print("{0:0.2f}% data is in training set".format((len(x_train)/len(cData.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test)/len(cData.index)) * 100))
print(x_train.shape)
print(x_test.shape)
# FIX: numeric_only=True keeps corr() working on pandas >= 2.0, where
# non-numeric columns (the categorical 'Age_bin') raise instead of being
# silently dropped.
cData.corr(numeric_only=True)  # correlation matrix of the numeric features
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the training split.
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)

# Predict on the held-out test split.
y_predict = model.predict(x_test)

# Inspect the learned coefficients and the intercept.
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)

# Mean accuracy on the test set.
model_score = model.score(x_test, y_test)
print(model_score)

# Confusion matrix with the positive class (1) listed first.
cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, y_predict, labels=[1, 0]))
Findings: It is found that the model accuracy of Logistic Regression is 94.
This means the model gives correct result 94% of the time.
from scipy.stats import zscore
# Standardise the features to z-scores: the units/scales of the attributes
# are unknown, and scaling is advisable for distance-based models (the KNN
# classifier below).
# BUG FIX: apply zscore only to numeric columns — the categorical 'Age_bin'
# column added during the EDA would otherwise make apply(zscore) fail.
XScaled = X.select_dtypes(include=np.number).apply(zscore)
XScaled.describe()
x_train, x_test, y_train, y_test = train_test_split(XScaled, Y, test_size=0.30, random_state=1)
from sklearn.neighbors import KNeighborsClassifier

# Fit a distance-weighted 5-nearest-neighbours classifier on the scaled data.
NNH = KNeighborsClassifier(n_neighbors=5, weights='distance')
NNH.fit(x_train, y_train)
predicted_labels = NNH.predict(x_test)
NNH.score(x_test, y_test)  # mean accuracy on the test split

# BUG FIX: the original built this confusion matrix from y_predict (the
# logistic-regression predictions), so the KNN heatmap showed the wrong
# model's results. Use the KNN predictions, matching the report below.
cm = metrics.confusion_matrix(y_test, predicted_labels, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, predicted_labels, labels=[1, 0]))
The model accuracy of KNN Classifier is 95.
This means the model gives the correct result 95% of the time.
from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes

# Fit a Gaussian Naive Bayes model on the (scaled) training split.
diab_model = GaussianNB()
# FIX: pass the Series directly — Series.ravel() is deprecated in pandas
# (slated for removal) and sklearn accepts a 1-D Series as y.
diab_model.fit(x_train, y_train)

# Accuracy on the training split. (Redundant re-imports of sklearn.metrics
# removed — metrics is already imported above.)
diab_train_predict = diab_model.predict(x_train)
print("Model Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, diab_train_predict)))
print()

# Accuracy on the held-out test split.
diab_test_predict = diab_model.predict(x_test)
print("Model Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, diab_test_predict)))
print()

print("Confusion Matrix")
cm = metrics.confusion_matrix(y_test, diab_test_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, diab_test_predict, labels=[1, 0]))
The model accuracy of Naive Bayes classifier is 89.
This means model gives correct result 89% of the time.
Out of the models created, it is clear from their scores that the KNN model is the best among the Logistic Regression and Naive Bayes models.
Reasons why KNN Model performs better than the rest is because:
1. KNN doesn't make prior distributional assumptions, unlike Naive Bayes.
2. Logistic regression assumes a linear decision boundary, so some points
may lie far from that boundary, whereas KNN computes distances to nearby
points, which yields higher accuracy on this data.